Create a histogram that shows the distribution of characters in a string. Use this function to draw a bar chart for the letters in your first name.
library(ggplot2)

# Draw a bar chart of the character frequencies in a string.
# The exercise asks for a reusable *function*; the original hard-coded the
# name and built everything at top level. Interface note: Set1 from
# RColorBrewer supports at most 9 distinct characters before warning.
#
# @param string A single character string.
# @param title  Plot title (defaults to the original chart title).
# @return A ggplot object (printed when evaluated at top level).
char_histogram <- function(string,
                           title = "Distribution of characters in first name") {
  chars <- strsplit(string, "")[[1]]
  char_count <- data.frame(table(chars))
  colnames(char_count) <- c("char", "count")
  ggplot(char_count, aes(x = char, y = count)) +
    geom_bar(stat = "identity", aes(fill = char)) +
    scale_fill_brewer(palette = "Set1") +
    ggtitle(title)
}

# Bar chart for the letters of the first name.
char_histogram("RAJKANWAR")
Dendrogram on US Arrests
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(ggdendro)
# Hierarchical clustering of the USArrests data using average linkage
# ("ave" is partial-matched to "average" by hclust).
arrest_clust <- hclust(dist(USArrests), method = "average")
# Render a static ggplot dendrogram, then hand it to plotly for interactivity.
dendro_plot <- ggdendrogram(arrest_clust, rotate = FALSE, size = 2)
ggplotly(dendro_plot)
Draw the linked view for Cloud multivariate dataset (available in UCI repository).
# Load the library for data visualization
library(ggplot2)
# Load the cloud dataset from UCI repository
idata <- read.csv("cloud.csv", header = TRUE)
# Scatter plot of visible-band vs IR-band means with a fitted regression line.
# Colour is mapped on geom_point only: mapping `contrast` in the shared
# aes() made geom_smooth drop the colour aesthetic during its statistical
# transformation (ggplot warns about this), so the smoother now gets the
# plain x/y mapping and fits a single line over all points.
ggplot(data = idata, aes(x = Visible_mean, y = IR_mean)) +
  geom_point(aes(color = contrast)) +
  geom_smooth(method = "lm") +
  xlab("Visible Mean") +
  ylab("IR Mean") +
  ggtitle("Linked View for Cloud Multivariate Dataset: Visible VS IR Mean")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Draw a graph matrix for Image Segmentation multivariate dataset
(available in UCI repository).
data <- read.csv("segmentation.csv", header = TRUE)
# Next, we'll use k-means clustering to group the data into clusters
library(stats)
# kmeans() requires purely numeric input: the segmentation file's first
# column is a character class label, which would make kmeans() error.
# Keep only numeric columns and scale them so no attribute dominates
# the Euclidean distances.
numeric_data <- data[sapply(data, is.numeric)]
set.seed(42)  # make the random cluster assignment reproducible
kmeans_result <- kmeans(scale(numeric_data), centers = 5)
clusters <- kmeans_result$cluster
# We'll use the 'ggplot2' package to plot the graph matrix
library(ggplot2)
# The original mapped the *constants* 1 and 2 to x/y, which drew every
# observation at the single point (1, 2). Plot the first two numeric
# attributes instead, referenced by name via the .data pronoun.
ggplot(numeric_data,
       aes(x = .data[[names(numeric_data)[1]]],
           y = .data[[names(numeric_data)[2]]],
           color = as.factor(clusters))) +
  geom_point(size = 3) +
  scale_color_discrete(name = "Cluster") +
  xlab(names(numeric_data)[1]) + ylab(names(numeric_data)[2]) +
  ggtitle("Graph Matrix using K-Means Clustering")
Draw exploratory graphics of a stock market Dataset.
library(ggplot2)
# Load the dataset
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Download Apple's daily price history via tidyquant.
data <- tq_get("AAPL", get = "stock.prices")
# Plot a scatter plot of the closing price and volume
ggplot(data, aes(x = close, y = volume)) +
  geom_point(color = "red") +
  ggtitle("Scatter Plot of Closing Price and Volume")
# Plot a line graph of the closing price over time
ggplot(data, aes(x = date, y = close)) +
  geom_line(color = "blue") +
  ggtitle("Closing Price of Apple Stock Over Time")
# Plot a histogram of the daily return
# The original plotted the raw adjusted *price* with binwidth = 1, which is
# a price histogram, not returns. Compute the day-over-day fractional change
# of the adjusted close first; the leading NA (no prior day) is dropped
# explicitly with na.rm.
data$daily_return <- c(NA, diff(data$adjusted) / head(data$adjusted, -1))
ggplot(data, aes(x = daily_return)) +
  geom_histogram(binwidth = 0.005, fill = "green", na.rm = TRUE) +
  ggtitle("Histogram of Daily Returns")
US Arrests K-means clustering
library(ggpubr)
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# USArrests ships with base R; work on a copy called `df`
# (later chunks reuse `df` and `res.km`, so those names are kept).
data("USArrests")
df <- USArrests
# Standardize the four crime variables, then partition the states with
# k-means, k = 3. nstart = 25 random restarts guard against a poor
# initialization; the fixed seed makes the partition reproducible.
set.seed(123)
scaled_arrests <- scale(df)
res.km <- kmeans(scaled_arrests, centers = 3, nstart = 25)
# Cluster label assigned to each state (named integer vector).
res.km$cluster
## Alabama Alaska Arizona Arkansas California
## 1 1 1 3 1
## Colorado Connecticut Delaware Florida Georgia
## 1 3 3 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 1 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
# Visualize the three clusters on the first two principal components,
# with convex hulls around each group.
fviz_cluster(
  res.km,
  data = df,
  palette = c("#E69F00", "#56B4E9", "#009E73"),
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_bw()
)
# Dimension reduction: PCA on the standardized variables.
res.pca <- prcomp(df, scale = TRUE)
# Per-state coordinates on the principal components.
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Attach each state's k-means cluster label as a factor column.
ind.coord$cluster <- factor(res.km$cluster)
# Percentage of variance explained by each principal dimension.
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
Visualize k-nearest-neighbor search on a ranked spatial dataset, using D3 quadtrees.
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
library(caTools)
library(class)
library(ggplot2)
data(iris)
head(iris)
# Split data 70/30. sample.split expects the *label vector*, not the whole
# data frame: passing `iris` split over columns and did not stratify by
# class. A seed makes the split reproducible.
set.seed(123)
split <- sample.split(iris$Species, SplitRatio = 0.7)
train_cl <- subset(iris, split == TRUE)
test_cl <- subset(iris, split == FALSE)
# Feature Scaling: the test set must be scaled with the *training* set's
# center and spread — scaling it independently (as before) leaks test-set
# information into the preprocessing.
train_scale <- scale(train_cl[, 1:4])
test_scale <- scale(test_cl[, 1:4],
                    center = attr(train_scale, "scaled:center"),
                    scale = attr(train_scale, "scaled:scale"))
# Fitting KNN Model to training dataset
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 1)
classifier_knn
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa setosa setosa setosa
## [19] setosa setosa versicolor virginica versicolor versicolor
## [25] versicolor versicolor versicolor versicolor virginica versicolor
## [31] virginica versicolor versicolor versicolor versicolor versicolor
## [37] versicolor versicolor versicolor versicolor virginica virginica
## [43] virginica virginica virginica virginica virginica versicolor
## [49] virginica virginica virginica virginica virginica versicolor
## [55] virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica
# Confusion matrix: rows = true species of the test set, columns = k-NN
# predictions (k = 1 model fitted above).
cm <- table(test_cl$Species, classifier_knn)
cm
## classifier_knn
## setosa versicolor virginica
## setosa 20 0 0
## versicolor 0 17 3
## virginica 0 2 18
# Model Evaluation - Choosing K
# Out-of-sample error: fraction of test rows whose prediction differs
# from the true label; accuracy is its complement.
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.916666666666667"
# Sweep over increasing K and report test accuracy for each model.
# K = 3
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 3)
misClassError <- mean(classifier_knn != test_cl$Species)
# Report accuracy for K = 3 as well — the original computed the error here
# but, inconsistently with every other K, never printed it.
print(paste('Accuracy =', 1-misClassError))
# K = 5
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 5)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.966666666666667"
# K = 7
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 7)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.966666666666667"
# K = 15
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 15)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.966666666666667"
# K = 19
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 19)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.966666666666667"
# Visualisation: scatter of petal dimensions coloured by true species, with
# large open circles (shape 1, size 15) marking the test points, coloured by
# the k-NN prediction.
# NOTE(review): `classifier_knn` at this point holds the predictions of the
# *last* fitted model (k = 19), not k = 1 — confirm that is the model
# intended for display.
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
geom_point(size = 3) +
ggtitle("Iris Data Set") +
labs(x = "Petal Length", y = "Petal Width", color = "Species") +
theme_bw() +
geom_point(data = test_cl, aes(Petal.Length, Petal.Width, color = classifier_knn), size = 15, shape = 1)
Implement data visualization using dendrogram.
# Hierarchical clustering of the motor-trend car data.
data(mtcars)
# Standardize the variables first, then compute Euclidean distances
# between cars. (`dd` and `hc` are reused by the chunks below.)
scaled_cars <- scale(mtcars)
dd <- dist(scaled_cars, method = "euclidean")
# ward.D2 merges, at each step, the pair of clusters that yields the
# smallest increase in total within-cluster variance, rather than
# comparing raw pairwise distances directly.
hc <- hclust(dd, method = "ward.D2")
library(factoextra)
# Basic dendrogram with shrunken labels.
fviz_dend(hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# The same tree with explicit title and axis labels.
fviz_dend(
  hc,
  cex = 0.5,
  main = "Dendrogram - ward.D2",
  xlab = "Objects",
  ylab = "Distance",
  sub = ""
)
# fviz_dend(hc, cex = 0.5, horiz = TRUE)  # horizontal variant, kept for reference
# Cut the tree into four groups and colour branches (and labels) per cluster.
fviz_dend(
  hc,
  k = 4,
  cex = 0.5,
  k_colors = c("blue", "green", "red", "black"),
  color_labels_by_k = TRUE,
  ggtheme = theme_gray()
)
Visualize functional data with an application to eBay’s online
auctions.
# Load required packages
library(fda)
## Loading required package: splines
## Loading required package: fds
## Loading required package: rainbow
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## Loading required package: pcaPP
## Loading required package: RCurl
## Loading required package: deSolve
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
library(fda.usc)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
## fda.usc is running sequentially usign foreach package
## Please, execute ops.fda.usc() once to run in local parallel mode
## Deprecated functions: min.basis, min.np, anova.hetero, anova.onefactor, anova.RPm
## New functions: optim.basis, optim.np, fanova.hetero, fanova.onefactor, fanova.RPm
## ----------------------------------------------------------------------------------
# Load Shill Bidding Dataset directly from the UCI repository.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill%20Bidding%20Dataset.csv"
shill <- read.csv(url, header = TRUE)
# Drop non-relevant variables; keep only bid count and auction duration.
shill <- shill[, c("Auction_Bids", "Auction_Duration")]
# Sort the data by Auction_Bids so the argument values are non-decreasing.
shill <- shill[order(shill$Auction_Bids),]
# Create a functional data object from the Auction_Duration variable using
# B-splines: a 10-basis spline over the observed range of bid counts.
# NOTE(review): Auction_Bids almost certainly contains tied values; confirm
# smooth.basis tolerates duplicated argvals here (the rendered output shows
# it completed, but the fit at tied points should be sanity-checked).
basis <- create.bspline.basis(rangeval = range(shill$Auction_Bids), nbasis = 10)
fd_shill <- smooth.basis(shill$Auction_Bids, shill$Auction_Duration, basis)
# Plot the smoothed functional relationship between bids and duration.
plot(fd_shill, xlab = "Number of Bids", ylab = "Auction Duration",
main = "Relationship between Bids and Auction Duration in eBay Auctions",
col = "red", lwd = 2)
## [1] "done"
Show graphical data representation in classification using Iris dataset
# Load required libraries
library(ggplot2)
library(datasets)
library(reshape2)
# Load iris dataset
data(iris)
# Define custom color palette (colour-blind friendly; reused by every
# plot in this section, so the name and values are kept as-is).
my_colors <- c("#E69F00", "#56B4E9", "#009E73")
# Scatterplot of sepal width against sepal length with a single fitted
# least-squares line (no confidence ribbon).
sepal_scatter <- ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(color = my_colors[1]) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Sepal Length", y = "Sepal Width",
       title = "Iris Dataset with Regression Line") +
  theme_classic()
sepal_scatter
## `geom_smooth()` using formula = 'y ~ x'
# Boxplot: distribution of sepal length within each species.
ggplot(iris) +
  aes(x = Species, y = Sepal.Length) +
  geom_boxplot(fill = my_colors[2]) +
  labs(x = "Species", y = "Sepal Length",
       title = "Iris Dataset: Sepal Length by Species") +
  theme_classic()
# Overlapping histograms of petal width, one semi-transparent fill
# colour per species.
ggplot(iris) +
  aes(x = Petal.Width, fill = Species) +
  geom_histogram(alpha = 0.5, bins = 30) +
  scale_fill_manual(values = my_colors) +
  labs(x = "Petal Width", y = "Count",
       title = "Iris Dataset: Petal Width by Species") +
  theme_classic()
# Bar chart of petal length by species.
# The original used stat = "identity" on the raw 150-row data, which
# silently stacks (i.e. sums) the 50 petal lengths within each species —
# a bar of ~200 "total cm" rather than anything interpretable. Summarise
# to the per-species mean, which is what one bar per species can
# meaningfully show.
ggplot(iris, aes(Species, Petal.Length, fill = Species)) +
  geom_bar(stat = "summary", fun = mean) +
  scale_fill_manual(values = my_colors) +
  labs(x = "Species", y = "Petal Length", title = "Iris Dataset: Petal Length by Species") +
  theme_classic()
# Heatmap of pairwise correlations between the four numeric variables.
iris_cor <- round(cor(iris[,1:4]), 2)  # correlation matrix, 2 decimal places
cor_long <- melt(iris_cor)             # wide matrix -> long (Var1, Var2, value)
ggplot(data = cor_long, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  # Diverging fill: palette colour 1 for -1, white at 0, colour 3 for +1.
  scale_fill_gradient2(low = my_colors[1], high = my_colors[3], mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Correlation") +
  theme_classic() +
  # Angle the x labels so the long variable names stay readable.
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  labs(x = "", y = "",
       title = "Correlations between variables in Iris dataset")
Draw a graph matrix for mushroom dataset (available in UCI
repository).
# Load the necessary packages
library(readr)
# Load the mushroom dataset straight from the UCI repository. The raw file
# has no header row, so col_names = FALSE makes readr autogenerate names
# X1..X23; X1 is the edible/poisonous class column.
# NOTE(review): the rendered column spec shows X5 parsed as logical
# ('t'/'f' values) rather than character — confirm that is acceptable
# for downstream use.
mushrooms <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
col_names = FALSE)
## Rows: 8124 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): X1, X2, X3, X4, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
## lgl (1): X5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Draw the graph matrix, colouring every panel by the class column.
# Reference the column by its bare name inside aes(): `mushrooms$X1`
# bypasses ggplot's data masking and breaks the per-panel data mapping
# that GGally relies on.
ggpairs(mushrooms, aes(colour = X1))